#include "xab.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
	Node program for column Cholesky factorization of
	a symmetric matrix partitioned by vertical strips
	using synchronous fan-out broadcast communication
*/

/*
 * Node process.
 *
 * Enrolls as "cholnode", receives the problem description, builds this
 * node's share of the test problem, runs the factorization and the two
 * triangular solves, then reports the largest solution error and the
 * elapsed times to instance 0 of "cholhost".
 *
 * Messages handled here:
 *   16001 (in)  three ints: n, a value this node ignores, and p
 *               (p is handed to the solvers as their process count)
 *   16002 (in)  n ints: map[k] is compared with the id returned by
 *               enroll() to decide which columns belong to this node
 *   16003 (out) one float: max over local i of |b[i] - (mycols[i]+1)|
 *   16004 (out) four floats: time() differences for cholfo, forsub,
 *               backsub, and from receipt of 16001 to just before
 *               this message is packed
 *
 * Exits with status 1 (after leave()) on bad parameters or when memory
 * cannot be allocated; exits with status 0 otherwise.
 */
int main()
{
	char name[32];
	int instance;
	long ti[8] ;	/* time() stamps; entries 2i and 2i+1 bracket one phase */
	int n, i, ncols, p, me, *map, *mycols, info[3] ;
	float *b, **col, tm[4], err, maxerr ;
	double mag ;

	me=enroll("cholnode");

	whoami(name,&instance);
	fprintf(stderr, "I am %s, instance %d\n",name,instance);

#ifdef USE_VC
	vrcv(16001);
#else
	rcv(16001);
#endif
	ti[6] = time(0);
	getnint(info,3);

	n = info[0] ;
	/* info[1] is received but not needed by the node */
	p = info[2];
	if (n <= 0 || n < p) {
		fprintf(stderr,"Node no %d : n=%d; p=%d\n",me,n,p);
		/* leave before exiting, exactly as the normal exit path does */
		leave();
		exit(1);
	}
	map = (int *)malloc(n*sizeof(int)) ;
	if (map == NULL) {
		fprintf(stderr,"Node no %d : out of memory\n",me);
		leave();
		exit(1);
	}

#ifdef USE_VC
	vrcv(16002);
#else
	rcv(16002);
#endif
	getnint(map,n);
	fprintf(stderr, "Received 16002\n");

	/* count the columns assigned to this node */
	ncols = 0 ;
	for ( i = 0 ; i < n; i++ ) if (map[i] == me) ncols++ ;

	/* allocate storage */
	mycols = (int *)malloc(ncols*sizeof(int)) ;
	b = (float *)malloc(ncols*sizeof(float)) ;
	col = (float * *)malloc(ncols*sizeof(float *)) ;
	/* malloc(0) may legally return NULL, so only complain when ncols > 0 */
	if (ncols > 0 && (mycols == NULL || b == NULL || col == NULL)) {
		fprintf(stderr,"Node no %d : out of memory\n",me);
		leave();
		exit(1);
	}

	/* set up data structures for problem to be solved */
	prob1(me, n, ncols, map, mycols, col, b) ;

	/* Cholesky factorization - fan-out synchronous algorithm */
	ti[0] = time(0);
	cholfo(n, ncols, map, mycols, col, me, p) ;
	ti[1] = time(0);

	/* forward substitution */
	ti[2] = time(0);
	forsub(n, ncols, map, mycols, col, b, me,p) ;
	ti[3] = time(0);

	/* back substitution */
	ti[4] = time(0);
	backsub(n, ncols, map, mycols, col, b, me,p) ;
	ti[5] = time(0);

	/* compute maximum error in solution and send to cholhost */
	/* the reference value for global unknown k is k+1 */
	maxerr = 0 ;
	for ( i = 0 ; i < ncols ; i++ ) {
		err = b[i] ;
		err -= mycols[i]+1 ;
		mag = fabs(err) ;
		if (maxerr < mag) maxerr = mag ;
	}
	initsend();
	putnfloat(&maxerr,1);
#ifdef USE_VC
	vsnd("cholhost",0,16003);
#else
	snd("cholhost",0,16003);
#endif

	/* send local times to host */
	ti[7] = time(0);
	initsend();
	for ( i = 0 ; i < 4 ; i++ ) {
		tm[i] = ti[2*i+1]-ti[2*i] ;
		putnfloat(&tm[i],1);
	}
#ifdef USE_VC
	vsnd("cholhost",0,16004);
#else
	snd("cholhost",0,16004);
#endif

	/* free storage */
	for ( i = 0 ; i < ncols ; i++ ) free(col[i]) ;
	free(col) ;
	free(b) ;
	free(mycols) ;
	free(map) ;
	
	leave();
	exit(0);
}


int prob1 ( me, n, ncols, map, mycols, col, b )
	int me, n, ncols, *map, *mycols ;
	float **col, *b ;

/*
 *  Build this node's part of the test problem.
 *
 *  For every k in 0..n-1 with map[k] == me, a column of n-k floats is
 *  allocated with malloc, stored in the next free slot of col[], and k is
 *  recorded in the matching slot of mycols[].  Column 0 holds n(n+1)/2
 *  followed by n-1 entries of -1; every other column holds 2 followed by
 *  zeros.  Afterwards b[i] is set to 2*mycols[i]+1 for i in 0..ncols-1.
 *
 *  col, mycols and b must have room for ncols entries; the malloc results
 *  are not checked.
 */
{
	int slot, k, r, len ;
	float *entries, order ;

	slot = 0 ;
	for ( k = 0 ; k < n ; k++ ) {
		if ( map[k] != me ) continue ;

		len = n-k ;
		entries = (float *)malloc(len*sizeof(float)) ;
		col[slot] = entries ;
		mycols[slot] = k ;
		slot++ ;

		if ( k == 0 ) {
			/* first column: large diagonal, -1 everywhere below */
			order = n ;
			entries[0] = order*(order+1)/2 ;
			for ( r = 1 ; r < n ; r++ ) entries[r] = -1 ;
		}
		else {
			/* remaining columns: diagonal 2, zero below */
			entries[0] = 2 ;
			for ( r = 1 ; r < len ; r++ ) entries[r] = 0 ;
		}
	}

	/* right-hand side, one entry per local column */
	for ( slot = 0 ; slot < ncols ; slot++ ) b[slot] = 2*mycols[slot]+1 ;
}

int cholfo ( n, ncols, map, mycols, col ,me,nprocs)
	int n, ncols, *map, *mycols ,me;
	float **col ;
	int nprocs;

/*
 * Synchronous Cholesky factorization.
 *
 * For each k in 0..n-1 the node with map[k] == me scales its column by
 * 1/sqrt(diagonal), copies it into a work buffer and, when nprocs > 1,
 * sends the n-k values to every "cholnode" (instance -1) under message
 * type k.  All other nodes receive type k into the work buffer.  Every
 * node then updates its local columns that have not been scaled yet.
 * The columns in col[] are overwritten in place.
 */
{
	int  k, c, r, len, shift ;
	int  next ;		/* first local column not yet scaled */
	float scale, mult, *own, *target, *pivot ;

	pivot = (float *)malloc(n*sizeof(float)) ;
	next = 0 ;
	for ( k = 0 ; k < n ; k++ ) {
		if (map[k] == me) {
			/* scale the local column and keep a copy to pass on */
			own = col[next] ;
			len = n-mycols[next] ;
			scale = 1.0/sqrt(own[0]) ;
			for ( r = 0 ; r < len ; r++ ) {
				own[r] *= scale ;
				pivot[r] = own[r] ;
			}
			next++ ;

			if (nprocs > 1) {
				initsend();
				putnfloat(pivot,(n-k));
#ifdef USE_VC
				vsnd("cholnode",-1,k); 
#else
				snd("cholnode",-1,k); 
#endif
			}
		}
		else {
#ifdef USE_VC
			vrcv(k);
#else
			rcv(k);
#endif
			getnfloat(pivot,(n-k));
		}

		/* subtract the pivot column's contribution from later columns */
		for ( c = next ; c < ncols ; c++ ) {
			target = col[c] ;
			len = n-mycols[c] ;
			shift = mycols[c]-k ;	/* pivot entry matching target[0] */
			mult = pivot[shift] ;
			for ( r = 0 ; r < len ; r++ )
				target[r] -= pivot[shift+r] * mult ;
		}
	}
	free(pivot) ;
}


int forsub ( n, ncols, map, mycols, col, b ,me,nprocs)
	int me, n, ncols, *map, *mycols ;
	float **col, *b ;
	int nprocs;

/*
 * Lower triangular forward solve, column-oriented fan-in algorithm.
 *
 * For each row k >= 1 every node sums b[i] times the row-k entry of each
 * local column already solved.  A node with map[k] != me sends that sum
 * to "cholnode" instance map[k] under message type k.  The node with
 * map[k] == me receives nprocs-1 such sums, adds them to its own, and
 * then finishes its next local unknown.  b[] is overwritten in place.
 */
{
	int  k, c, peer ;
	int  solved ;		/* number of local unknowns already finished */
	float partial, remote ;

	solved = 0 ;
	if (map[0] == me) {
		b[solved] /= *col[solved] ;
		solved++ ;
	}

	for ( k = 1 ; k < n ; k++ ) {
		/* local contribution to row k */
		partial = 0 ;
		for ( c = 0 ; c < solved ; c++ )
			partial += b[c] * col[c][k-mycols[c]] ;

		if (map[k] != me) {
			/* not the owner: ship the partial sum and move on */
			initsend();
			putnfloat(&partial,1);
#ifdef USE_VC
			vsnd("cholnode",map[k],k);
#else
			snd("cholnode",map[k],k);
#endif
			continue ;
		}

		/* owner: gather one partial sum from each other node */
		for ( peer = 0 ; peer < (nprocs-1) ; peer++ ) {
#ifdef USE_VC
			vrcv(k);
#else
			rcv(k);
#endif
			getnfloat(&remote,1);
			partial += remote ;
		}

		b[solved] -= partial ;
		b[solved] /= *col[solved] ;
		solved++ ;
	}
}


int backsub ( n, nrows, map, myrows, row, b ,me, nprocs)
	int me, n, nrows, *map, *myrows ;
	float **row, *b ;
	int nprocs;

/*
 * Upper triangular backward solve, row-oriented fan-out algorithm.
 *
 * Works from k = n-1 down to 1.  The node with map[k] == me divides its
 * current unknown by the leading entry of the matching row and, when
 * there is more than one process, sends the value to every "cholnode"
 * (instance -1) under message type k; the other nodes receive type k.
 * Every node then removes that value's contribution from its unknowns
 * that are still open.  Unknown 0 is finished after the loop by the node
 * with map[0] == me.  b[] is overwritten in place.
 *
 *   n       number of unknowns
 *   nrows   number of entries in row[], myrows[] and b[] on this node
 *   map     map[k] is compared with me to find the owner of unknown k
 *   myrows  global index of each local row
 *   row     local rows; row[i][0] is the diagonal entry
 *   nprocs  number of processes; the send is skipped when it is 1,
 *           matching what cholfo does
 */
{
	int i, j, k ;
	float x ;

	j = nrows-1 ;
	for ( k = n-1 ; k > 0 ; k-- ) {
		if (map[k] == me) {
			b[j] /= *row[j] ;
			x = b[j] ;
			j-- ;
			/* nobody to tell when this is the only process */
			if (nprocs > 1) {
				initsend();
				putnfloat(&x,1);
#ifdef USE_VC
				vsnd("cholnode",-1,k);
#else
				snd("cholnode",-1,k);
#endif
			}
		}
		else {
#ifdef USE_VC
			vrcv(k);
#else
			rcv(k);
#endif
			getnfloat(&x,1);
		}
		/* eliminate unknown k from the local rows still to be solved */
		for ( i = j ; i >= 0 ; i-- ) b[i] -= x * row[i][k-myrows[i]] ;
	}
	if (map[0] == me) b[j] /= *row[j] ;
	return 0 ;
}

